In [73]:
import os
from pathlib import Path
import json
import linecache
import functools

import pandas as pd
import geopandas as gpd 
import matplotlib.pyplot as plt
import folium 
from shapely.ops import nearest_points
from shapely.geometry import LineString
In [3]:
DATA_PATH = Path('/data/safegraph/safegraph_open_census_data')
PREPROCESSED_DATA_PATH = Path('../../../data/preprocessed/safegraph/safegraph_open_census_data')
In [100]:
# Only needs to be run once
county_fips_code = '45039'  # Fairfield County, South Carolina

!mkdir -p {PREPROCESSED_DATA_PATH}
census_data_file_names = !ls {DATA_PATH}/data/ | grep [0-9] #  | cut -f 1 -d .  # eliminate .csv suffix
county_directory = PREPROCESSED_DATA_PATH / "data/county" / county_fips_code
!mkdir -p {county_directory}
for file_name in census_data_file_names:
    !touch {county_directory}/{file_name}
    print(county_directory/file_name)
    !head -n 1 "{DATA_PATH}/data/{file_name}" > {county_directory}/{file_name}
    !cat "{DATA_PATH}/data/{file_name}" | grep ^{county_fips_code}.*$ >> {county_directory}/{file_name}
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b00.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b01.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b02.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b03.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b07.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b08.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b09.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b11.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b12.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b14.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b15.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b19.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b20.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b21.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b22.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b23.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b25.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b27.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_b99.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c16.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c17.csv
../../../data/preprocessed/safegraph/safegraph_open_census_data/data/county/45039/cbg_c24.csv
In [113]:
# See: https://www.safegraph.com/blog/beginners-guide-to-census
    
table_ids = [
    'B01001e1',   # SEX BY AGE: Total: Total population -- (Estimate),Sex By Age, Total, Total Population -- (Estimate),,,,,
    #'B00001e1',   # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Estimate),Unweighted Sample Count Of The Population, Total, Total Population -- (Estimate),,,,,
    #'B00001m1',   # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Margin of Error),Unweighted Sample Count Of The Population, Total, Total Population -- (Margin Of Error),,,,,
    'B19013e1',   # Median Household Income
    #'B00002e1',   # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Estimate),Unweighted Sample Housing Units, Total, Housing Units -- (Estimate),,,,,
    #'B00002m1',   # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Margin of Error),Unweighted Sample Housing Units, Total, Housing Units -- (Margin Of Error),,,,,
    'B25001e1',   # HOUSING UNITS: Total: Housing units -- (Estimate),Housing Units, Total, Housing Units -- (Estimate),,,,,
    #'B25001m1',   # HOUSING UNITS: Total: Housing units -- (Margin of Error),Housing Units, Total, Housing Units -- (Margin Of Error),,,,,
]

cbg_field_desc = pd.read_csv(DATA_PATH / 'metadata/cbg_field_descriptions.csv')
cbg_field_desc[cbg_field_desc.table_id.isin(table_ids)]
Out[113]:
table_id field_full_name field_level_1 field_level_2 field_level_3 field_level_4 field_level_5 field_level_6 field_level_7 field_level_8
4 B01001e1 SEX BY AGE: Total: Total population -- (Estimate) Sex By Age Total Total Population -- (Estimate) NaN NaN NaN NaN NaN
3094 B19013e1 MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS ... Median Household Income In The Past 12 Months ... Total Households -- (Estimate) NaN NaN NaN NaN NaN
4394 B25001e1 HOUSING UNITS: Total: Housing units -- (Estimate) Housing Units Total Housing Units -- (Estimate) NaN NaN NaN NaN NaN
In [114]:
county_files = ['cbg_b00.csv', 'cbg_b01.csv', 'cbg_b19.csv', 'cbg_b25.csv'] #  !ls {county_directory}

dfs = [pd.read_csv(county_directory / file, dtype={'census_block_group': str}) for file in county_files]
merged = dfs[0]
for df in dfs[1:]:
    merged = pd.merge(merged, df, on=['census_block_group'])
cbg_data = merged
    
#combiner = lambda left, right: pd.merge(left, right, on=['census_block_group'])
#cbg_data = functools.reduce(combiner, dfs[0], dfs[1:])


'''cbg_b19 = pd.read_csv(county_directory / 'cbg_b19.csv', dtype={'census_block_group': str})
cbg_b01 = pd.read_csv(county_directory / 'cbg_b01.csv', dtype={'census_block_group': str})
cbg_data = pd.merge(cbg_b01, cbg_b19, on=['census_block_group'])'''
cbg_data = cbg_data[['census_block_group'] + table_ids]
#cbg_data.dropna().head()
cbg_data
Out[114]:
census_block_group B01001e1 B19013e1 B25001e1
0 450399601001 1101 29250.0 624
1 450399601002 931 39931.0 523
2 450399602001 1015 46898.0 517
3 450399602002 1260 30481.0 782
4 450399602003 1467 48125.0 642
5 450399603001 1046 68902.0 1079
6 450399603002 890 NaN 449
7 450399603003 1482 30625.0 678
8 450399603004 2257 31346.0 982
9 450399604001 962 26250.0 451
10 450399604002 822 NaN 462
11 450399604003 660 70050.0 465
12 450399604004 815 21024.0 550
13 450399604005 2115 20601.0 938
14 450399605001 2015 57375.0 609
15 450399605002 1236 48722.0 538
16 450399605003 1596 33125.0 684
17 450399605004 1355 31306.0 689
In [117]:
sum(cbg_data['B01001e1'])  # total population
Out[117]:
23025
In [115]:
cbg_data['household_size'] = cbg_data['B01001e1'] / cbg_data['B25001e1']
<ipython-input-115-a1e05c5d3218>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbg_data['household_size'] = cbg_data['B01001e1'] / cbg_data['B25001e1']
In [118]:
cbg_data
Out[118]:
census_block_group B01001e1 B19013e1 B25001e1 household_size
0 450399601001 1101 29250.0 624 1.764423
1 450399601002 931 39931.0 523 1.780115
2 450399602001 1015 46898.0 517 1.963250
3 450399602002 1260 30481.0 782 1.611253
4 450399602003 1467 48125.0 642 2.285047
5 450399603001 1046 68902.0 1079 0.969416
6 450399603002 890 NaN 449 1.982183
7 450399603003 1482 30625.0 678 2.185841
8 450399603004 2257 31346.0 982 2.298371
9 450399604001 962 26250.0 451 2.133038
10 450399604002 822 NaN 462 1.779221
11 450399604003 660 70050.0 465 1.419355
12 450399604004 815 21024.0 550 1.481818
13 450399604005 2115 20601.0 938 2.254797
14 450399605001 2015 57375.0 609 3.308703
15 450399605002 1236 48722.0 538 2.297398
16 450399605003 1596 33125.0 684 2.333333
17 450399605004 1355 31306.0 689 1.966618
In [11]:
# Census Block Groups
In [16]:
def geojson_for_county(state_abbreviation="SC",
                       county_name="Fairfield County",
                       geojson_path=DATA_PATH / "geometry/cbg.geojson"):
    header = !head -n 5 {geojson_path}
    footer = !tail -n 2 {geojson_path}
    
    # lines to search file for county of interest.
    # must be found by inspection using "tail | head" method below, and checking whether 
    # the state of interest is included.
    # If not included, search up or down via binary search (file is sorted by state)
    # TODO: write the binary search explicitly here, if we need to generalize to other states/counties
    line_start_search = 170000
    line_end_search = 180000
    num_lines = line_end_search - line_start_search
    
    # stream = os.popen(f"""< {geojson_path} tail -n +{line_start_search} | head -n {num_lines} | grep '"State": "{state_abbreviation}", "County": "{county_name}"'  """)
    #stream = os.popen(f"""cat {geojson_path} | tail -n +{line_start_search} | head -n {num_lines} | grep '"State": "{state_abbreviation}", "County": "{county_name}"'  """)
    #county_cbgs = stream.readlines()
    
    county_cbgs = [linecache.getline(str(geojson_path), line_number).strip() for line_number in range(line_start_search, line_end_search)]
    county_cbgs = [line for line in county_cbgs if f'"State": "{state_abbreviation}", "County": "{county_name}"' in line]
    
    print(len(county_cbgs))
    
    # remove final character from last entry in list:
    # a trailing "," that will mess up the JSON parsing
    if county_cbgs[-1][-1] == ',':
        county_cbgs[-1] = county_cbgs[-1][:-1]
    
    return json.loads('\n'.join(header + county_cbgs + footer))
    
In [17]:
# only needs to be run once
geojson = geojson_for_county()
18
In [20]:
#len([f['properties']['CensusBlockGroup'] for f in geojson['features']])
#[f['properties']['CensusBlockGroup'] for f in cbgs_json['features']]
Out[20]:
18
In [24]:
#geojson['features'][0]['properties']
Out[24]:
{'StateFIPS': '45',
 'CountyFIPS': '039',
 'TractCode': '960300',
 'BlockGroup': '2',
 'CensusBlockGroup': '450399603002',
 'State': 'SC',
 'County': 'Fairfield County',
 'ClassCode': 'H1'}
In [120]:
m = folium.Map(location=[34.4, -81.1], zoom_start=10.5)

folium.Choropleth(
    geo_data=geojson,
    name='Median Household Income',
    data=cbg_data,
    columns=['census_block_group', 'B19013e1'], # B01001e1, # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Greens',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Median Household Income ($)'
).add_to(m)

folium.Choropleth(
    geo_data=geojson,
    name='Population',
    data=cbg_data,
    columns=['census_block_group', 'B01001e1'], # , # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Blues',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Population (# of people)'
).add_to(m)

folium.Choropleth(
    geo_data=geojson,
    name='Household Size',
    data=cbg_data,
    columns=['census_block_group', 'household_size'], # , # ['State', 'Unemployment'],
    key_on='feature.properties.CensusBlockGroup',
    fill_color='Reds',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Household Size (# of people)'
).add_to(m)

folium.LayerControl().add_to(m)

m
Out[120]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: